# -*- coding: utf-8 -*-
# Sina News is blocked from here, so this spider targets China News (chinanews.com) instead.
import scrapy
from homework.items import NewsItem

class ChinanewsSpider(scrapy.Spider):
    """Spider that collects link titles and URLs from the chinanews.com sitemap.

    It requests the single sitemap page listed in ``start_urls`` and emits one
    ``NewsItem`` (fields ``title`` and ``link``) for every list entry that
    carries a hyperlink.
    """

    name = 'chinanews'
    allowed_domains = ['chinanews.com']
    start_urls = ['http://www.chinanews.com/common/footer/sitemap.shtml']

    # NOTE(review): the original file carried a Firefox 60 user-agent string in
    # a trailing comment; presumably it was meant for the project's USER_AGENT
    # setting -- confirm in settings.py.

    def parse(self, response):
        """Yield a ``NewsItem`` for each ``<li>`` link under ``div.list``.

        Args:
            response: The Scrapy response for the sitemap page.

        Yields:
            NewsItem: populated with ``title`` (the anchor's first text node,
            or ``None`` when the anchor has no direct text) and ``link`` (an
            absolute URL).
        """
        self.logger.debug("Fetched %s (status %s)", response.url, response.status)

        for block in response.css("div.list"):
            for entry in block.css("ul li"):
                href = entry.css("a::attr(href)").extract_first()
                if not href:
                    # List entry without a hyperlink: nothing to record, and
                    # concatenating onto None would raise TypeError.
                    continue

                if href.startswith('//'):
                    # Protocol-relative URL: force https, as the spider always did.
                    link = 'https:' + href
                else:
                    # Absolute or path-relative URL: resolve against the page
                    # instead of blindly prefixing a scheme.
                    link = response.urljoin(href)

                news = NewsItem()
                news['title'] = entry.css("a::text").extract_first()
                news['link'] = link
                # Lazy %-formatting copes with a missing (None) title.
                self.logger.debug("%s:%s", news['title'], news['link'])

                # Hand the item to Scrapy; without this nothing reaches the
                # item pipelines or feed exports.
                yield news
